In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
In [2]:
df=pd.read_csv('airbnb.csv')
In [3]:
df.head()
Out[3]:
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
In [4]:
df.describe()
Out[4]:
| id | host_id | latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.889500e+04 | 4.889500e+04 | 48895.000000 | 48895.000000 | 48895.000000 | 48895.000000 | 48895.000000 | 38843.000000 | 48895.000000 | 48895.000000 |
| mean | 1.901714e+07 | 6.762001e+07 | 40.728949 | -73.952170 | 152.720687 | 7.029962 | 23.274466 | 1.373221 | 7.143982 | 112.781327 |
| std | 1.098311e+07 | 7.861097e+07 | 0.054530 | 0.046157 | 240.154170 | 20.510550 | 44.550582 | 1.680442 | 32.952519 | 131.622289 |
| min | 2.539000e+03 | 2.438000e+03 | 40.499790 | -74.244420 | 0.000000 | 1.000000 | 0.000000 | 0.010000 | 1.000000 | 0.000000 |
| 25% | 9.471945e+06 | 7.822033e+06 | 40.690100 | -73.983070 | 69.000000 | 1.000000 | 1.000000 | 0.190000 | 1.000000 | 0.000000 |
| 50% | 1.967728e+07 | 3.079382e+07 | 40.723070 | -73.955680 | 106.000000 | 3.000000 | 5.000000 | 0.720000 | 1.000000 | 45.000000 |
| 75% | 2.915218e+07 | 1.074344e+08 | 40.763115 | -73.936275 | 175.000000 | 5.000000 | 24.000000 | 2.020000 | 2.000000 | 227.000000 |
| max | 3.648724e+07 | 2.743213e+08 | 40.913060 | -73.712990 | 10000.000000 | 1250.000000 | 629.000000 | 58.500000 | 327.000000 | 365.000000 |
In [5]:
df.dtypes
Out[5]:
id int64 name object host_id int64 host_name object neighbourhood_group object neighbourhood object latitude float64 longitude float64 room_type object price int64 minimum_nights int64 number_of_reviews int64 last_review object reviews_per_month float64 calculated_host_listings_count int64 availability_365 int64 dtype: object
In [6]:
df.shape
Out[6]:
(48895, 16)
In [7]:
## to count the number of missing values-
missing_count=df.isnull().sum()
print(missing_count)
id 0 name 16 host_id 0 host_name 21 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 10052 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 dtype: int64
In [8]:
df.duplicated().sum()
Out[8]:
0
In [9]:
df.isna().sum()
Out[9]:
id 0 name 16 host_id 0 host_name 21 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 10052 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 dtype: int64
EDA¶
In [10]:
columns_with_missing = ['last_review', 'reviews_per_month']
# Filter rows where these columns have missing values
missing_values_df = df[df[columns_with_missing].isnull().any(axis=1)]
# Display a glimpse of the rows with missing values
print(missing_values_df[columns_with_missing])
last_review reviews_per_month 2 NaN NaN 19 NaN NaN 26 NaN NaN 36 NaN NaN 38 NaN NaN ... ... ... 48890 NaN NaN 48891 NaN NaN 48892 NaN NaN 48893 NaN NaN 48894 NaN NaN [10052 rows x 2 columns]
In [11]:
# this can help us in determining how to fill the missing values
unique_dates = df['last_review'].unique()
print(unique_dates)
#######
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')
max_year = df['last_review'].dt.year.max()
min_year = df['last_review'].dt.year.min()
print(f"Max Year: {max_year}")
print(f"Min Year: {min_year}")
['2018-10-19' '2019-05-21' nan ... '2017-12-23' '2018-01-29' '2018-03-29'] Max Year: 2019.0 Min Year: 2011.0
In [12]:
#clean the 'last_review' column
df['last_review'] = pd.to_datetime(df['last_review'])
min_date= df['last_review'].min()
max_date= df['last_review'].max()
all_dates = pd.date_range(min_date, max_date, freq='D')
In [13]:
df['last_review'] = pd.to_datetime(df['last_review'])
df['Year'] = df['last_review'].dt.year
sns.relplot(x='Year', y='reviews_per_month', data=df, kind="line", ci=None)
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\axisgrid.py:848: FutureWarning:
The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.
func(*plot_args, **plot_kwargs)
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
Out[13]:
<seaborn.axisgrid.FacetGrid at 0x1660298d310>
In [14]:
#drop the column Year because we don't need it anymore
# Assuming 'Year' is the column you want to drop
df.drop(columns='Year', inplace=True)
In [15]:
# no clear trends!
# so i can fill the missing values of column last_review: mean/mode OR Forward/Backward Fill
# i will choose mean, because I will not sort the dataframe by dates and its not a time series application.
mean_last_rev = df['last_review'].mean()
df['last_review'].fillna(value=mean_last_rev, inplace=True)
In [16]:
#understand the distribution first
plt.hist(df['reviews_per_month'], bins=30, color='skyblue', edgecolor='black')
plt.title('Histogram of reviews_per_month')
plt.xlabel('reviews_per_month')
plt.ylabel('Frequency')
plt.show()
# the distribution is right-skewed
In [17]:
mean_value = df['reviews_per_month'].mean()
median_value = df['reviews_per_month'].median()
mode_value = df['reviews_per_month'].mode().iloc[0]
std_dev = df['reviews_per_month'].std()
print(f"Mean: {mean_value}, Median: {median_value}, Mode: {mode_value}, Std Dev: {std_dev}")
Mean: 1.3732214298586618, Median: 0.72, Mode: 0.02, Std Dev: 1.6804419952744627
In [18]:
# I will fill the NaN values by median because of the distribution
df['reviews_per_month'].fillna(value=df['reviews_per_month'].median(), inplace=True)
In [19]:
# Dropping rows with missing values in 'name' and 'host_name' due to small percentage of data
df.dropna(subset=['name', 'host_name'], inplace=True)
In [20]:
df
Out[20]:
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 00:00:00.000000000 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 00:00:00.000000000 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | 2018-10-04 01:47:23.910099456 | 0.72 | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 00:00:00.000000000 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 00:00:00.000000000 | 0.10 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 48890 | 36484665 | Charming one bedroom - newly renovated rowhouse | 8232441 | Sabrina | Brooklyn | Bedford-Stuyvesant | 40.67853 | -73.94995 | Private room | 70 | 2 | 0 | 2018-10-04 01:47:23.910099456 | 0.72 | 2 | 9 |
| 48891 | 36485057 | Affordable room in Bushwick/East Williamsburg | 6570630 | Marisol | Brooklyn | Bushwick | 40.70184 | -73.93317 | Private room | 40 | 4 | 0 | 2018-10-04 01:47:23.910099456 | 0.72 | 2 | 36 |
| 48892 | 36485431 | Sunny Studio at Historical Neighborhood | 23492952 | Ilgar & Aysel | Manhattan | Harlem | 40.81475 | -73.94867 | Entire home/apt | 115 | 10 | 0 | 2018-10-04 01:47:23.910099456 | 0.72 | 1 | 27 |
| 48893 | 36485609 | 43rd St. Time Square-cozy single bed | 30985759 | Taz | Manhattan | Hell's Kitchen | 40.75751 | -73.99112 | Shared room | 55 | 1 | 0 | 2018-10-04 01:47:23.910099456 | 0.72 | 6 | 2 |
| 48894 | 36487245 | Trendy duplex in the very heart of Hell's Kitchen | 68119814 | Christophe | Manhattan | Hell's Kitchen | 40.76404 | -73.98933 | Private room | 90 | 7 | 0 | 2018-10-04 01:47:23.910099456 | 0.72 | 1 | 23 |
48858 rows × 16 columns
In [21]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 48858 entries, 0 to 48894 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 48858 non-null int64 1 name 48858 non-null object 2 host_id 48858 non-null int64 3 host_name 48858 non-null object 4 neighbourhood_group 48858 non-null object 5 neighbourhood 48858 non-null object 6 latitude 48858 non-null float64 7 longitude 48858 non-null float64 8 room_type 48858 non-null object 9 price 48858 non-null int64 10 minimum_nights 48858 non-null int64 11 number_of_reviews 48858 non-null int64 12 last_review 48858 non-null datetime64[ns] 13 reviews_per_month 48858 non-null float64 14 calculated_host_listings_count 48858 non-null int64 15 availability_365 48858 non-null int64 dtypes: datetime64[ns](1), float64(3), int64(7), object(5) memory usage: 6.3+ MB
In [22]:
# to check the unique values for neighourhood group and room type
print(df['neighbourhood_group'].unique())
print(df['room_type'].unique())
['Brooklyn' 'Manhattan' 'Queens' 'Staten Island' 'Bronx'] ['Private room' 'Entire home/apt' 'Shared room']
In [23]:
#to check unique values for the the neighourhood
df['neighbourhood'].unique()
Out[23]:
array(['Kensington', 'Midtown', 'Harlem', 'Clinton Hill', 'East Harlem',
'Murray Hill', 'Bedford-Stuyvesant', "Hell's Kitchen",
'Upper West Side', 'Chinatown', 'South Slope', 'West Village',
'Williamsburg', 'Fort Greene', 'Chelsea', 'Crown Heights',
'Park Slope', 'Windsor Terrace', 'Inwood', 'East Village',
'Greenpoint', 'Bushwick', 'Flatbush', 'Lower East Side',
'Prospect-Lefferts Gardens', 'Long Island City', 'Kips Bay',
'SoHo', 'Upper East Side', 'Prospect Heights',
'Washington Heights', 'Woodside', 'Brooklyn Heights',
'Carroll Gardens', 'Gowanus', 'Flatlands', 'Cobble Hill',
'Flushing', 'Boerum Hill', 'Sunnyside', 'DUMBO', 'St. George',
'Highbridge', 'Financial District', 'Ridgewood',
'Morningside Heights', 'Jamaica', 'Middle Village', 'NoHo',
'Ditmars Steinway', 'Flatiron District', 'Roosevelt Island',
'Greenwich Village', 'Little Italy', 'East Flatbush',
'Tompkinsville', 'Astoria', 'Clason Point', 'Eastchester',
'Kingsbridge', 'Two Bridges', 'Rockaway Beach', 'Forest Hills',
'Nolita', 'Woodlawn', 'University Heights', 'Gravesend',
'Gramercy', 'Allerton', 'East New York', 'Theater District',
'Concourse Village', 'Sheepshead Bay', 'Emerson Hill',
'Fort Hamilton', 'Bensonhurst', 'Tribeca', 'Shore Acres',
'Sunset Park', 'Concourse', 'Elmhurst', 'Brighton Beach',
'Jackson Heights', 'Cypress Hills', 'St. Albans', 'Arrochar',
'Rego Park', 'Wakefield', 'Clifton', 'Bay Ridge', 'Graniteville',
'Spuyten Duyvil', 'Stapleton', 'Briarwood', 'Ozone Park',
'Columbia St', 'Vinegar Hill', 'Mott Haven', 'Longwood',
'Canarsie', 'Battery Park City', 'Civic Center', 'East Elmhurst',
'New Springville', 'Morris Heights', 'Arverne', 'Cambria Heights',
'Tottenville', 'Mariners Harbor', 'Concord', 'Borough Park',
'Bayside', 'Downtown Brooklyn', 'Port Morris', 'Fieldston',
'Kew Gardens', 'Midwood', 'College Point', 'Mount Eden',
'City Island', 'Glendale', 'Port Richmond', 'Red Hook',
'Richmond Hill', 'Queens Village', 'Bellerose', 'Maspeth',
'Williamsbridge', 'Soundview', 'Woodhaven', 'Woodrow',
'Co-op City', 'Stuyvesant Town', 'Parkchester', 'North Riverdale',
'Dyker Heights', 'Bronxdale', 'Sea Gate', 'Riverdale',
'Kew Gardens Hills', 'Bay Terrace', 'Norwood', 'Claremont Village',
'Whitestone', 'Fordham', 'Bayswater', 'Navy Yard', 'Brownsville',
'Eltingville', 'Fresh Meadows', 'Mount Hope', 'Lighthouse Hill',
'Springfield Gardens', 'Howard Beach', 'Belle Harbor',
'Jamaica Estates', 'Van Nest', 'Morris Park', 'West Brighton',
'Far Rockaway', 'South Ozone Park', 'Tremont', 'Corona',
'Great Kills', 'Manhattan Beach', 'Marble Hill', 'Dongan Hills',
'Castleton Corners', 'East Morrisania', 'Hunts Point', 'Neponsit',
'Pelham Bay', 'Randall Manor', 'Throgs Neck', 'Todt Hill',
'West Farms', 'Silver Lake', 'Morrisania', 'Laurelton',
'Grymes Hill', 'Holliswood', 'Pelham Gardens', 'Belmont',
'Rosedale', 'Edgemere', 'New Brighton', 'Midland Beach',
'Baychester', 'Melrose', 'Bergen Beach', 'Richmondtown',
'Howland Hook', 'Schuylerville', 'Coney Island', 'New Dorp Beach',
"Prince's Bay", 'South Beach', 'Bath Beach', 'Jamaica Hills',
'Oakwood', 'Castle Hill', 'Hollis', 'Douglaston', 'Huguenot',
'Olinville', 'Edenwald', 'Grant City', 'Westerleigh',
'Bay Terrace, Staten Island', 'Westchester Square', 'Little Neck',
'Fort Wadsworth', 'Rosebank', 'Unionport', 'Mill Basin',
'Arden Heights', "Bull's Head", 'New Dorp', 'Rossville',
'Breezy Point', 'Willowbrook'], dtype=object)
Visualization¶
In [24]:
sns.countplot(x='neighbourhood_group',data=df,palette='rocket')
plt.title('neighbourhood_group')
Out[24]:
Text(0.5, 1.0, 'neighbourhood_group')
In [25]:
plt.title('room_type')
sns.countplot(x='room_type',data=df,palette='mako')
Out[25]:
<Axes: title={'center': 'room_type'}, xlabel='room_type', ylabel='count'>
In [26]:
plt.figure(figsize=(6,8),dpi=80)
plt.title('neighbourhood_group')
sns.barplot(x='neighbourhood_group',y='number_of_reviews',data=df,palette='muted')
Out[26]:
<Axes: title={'center': 'neighbourhood_group'}, xlabel='neighbourhood_group', ylabel='number_of_reviews'>
In [27]:
plt.figure(figsize=(6,8),dpi=72)
plt.title('room_type')
sns.barplot(x='room_type',y='number_of_reviews',data=df,palette='rocket')
Out[27]:
<Axes: title={'center': 'room_type'}, xlabel='room_type', ylabel='number_of_reviews'>
In [28]:
plt.figure(figsize=(10, 6))
sns.countplot(x='neighbourhood_group', hue='room_type', data=df, palette='Spectral')
plt.title('Neighbourhood Group vs Room Type')
plt.xlabel('Neighbourhood Group')
plt.ylabel('Count')
plt.show()
In [29]:
sns.catplot(x='room_type', y='price', data=df, palette='rocket')
C:\Users\Mayank\AppData\Local\Temp\ipykernel_8536\2459576386.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated.
sns.catplot(x='room_type', y='price', data=df, palette='rocket')
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x1660296e8d0>
In [30]:
import plotly.express as px
# Create a scatter mapbox visualization
fig = px.scatter_mapbox(
df,
lat="latitude",
lon="longitude",
color="room_type",
size="price",
size_max=10,
mapbox_style="carto-positron",
hover_name="neighbourhood_group",
hover_data=["price"],
title="Room Types in NYC"
)
# Show the visualization
fig.show()
univariate analysis¶
In [31]:
#for the numerical data in df
df.describe().T
Out[31]:
| count | mean | min | 25% | 50% | 75% | max | std | |
|---|---|---|---|---|---|---|---|---|
| id | 48858.0 | 19023349.934565 | 2539.0 | 9475979.75 | 19691143.5 | 29157648.25 | 36487245.0 | 10982893.614232 |
| host_id | 48858.0 | 67631688.285951 | 2438.0 | 7818668.75 | 30791331.0 | 107434423.0 | 274321313.0 | 78623888.992733 |
| latitude | 48858.0 | 40.728941 | 40.49979 | 40.69009 | 40.72307 | 40.763107 | 40.91306 | 0.054528 |
| longitude | 48858.0 | -73.95217 | -74.24442 | -73.98307 | -73.95568 | -73.93628 | -73.71299 | 0.046159 |
| price | 48858.0 | 152.740309 | 0.0 | 69.0 | 106.0 | 175.0 | 10000.0 | 240.232386 |
| minimum_nights | 48858.0 | 7.012444 | 1.0 | 1.0 | 3.0 | 5.0 | 1250.0 | 20.019757 |
| number_of_reviews | 48858.0 | 23.273098 | 0.0 | 1.0 | 5.0 | 24.0 | 629.0 | 44.549898 |
| last_review | 48858 | 2018-10-04 06:14:40.804078336 | 2011-03-28 00:00:00 | 2018-10-04 01:47:23.910099456 | 2019-01-03 00:00:00 | 2019-06-19 00:00:00 | 2019-07-08 00:00:00 | NaN |
| reviews_per_month | 48858.0 | 1.239035 | 0.01 | 0.28 | 0.72 | 1.58 | 58.5 | 1.520889 |
| calculated_host_listings_count | 48858.0 | 7.148369 | 1.0 | 1.0 | 1.0 | 2.0 | 327.0 | 32.9646 |
| availability_365 | 48858.0 | 112.801425 | 0.0 | 0.0 | 45.0 | 227.0 | 365.0 | 131.610962 |
we will focus on price in numerical data¶
In [32]:
sns.histplot(data=df['price'])
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
Out[32]:
<Axes: xlabel='price', ylabel='Count'>
In [33]:
sns.boxplot(data=df, y='price', color='lightgreen')
plt.title('Boxplot of Price')
plt.show()
for categorical value¶
In [34]:
df['room_type'].value_counts().plot(kind='pie',autopct='%.2f')
#df['neighbourhood_group'].value_counts().plot(kind='pie',autopct='%.2f')
Out[34]:
<Axes: ylabel='count'>
In [35]:
df['neighbourhood_group'].value_counts().plot(kind='pie',autopct='%.2f')
Out[35]:
<Axes: ylabel='count'>
Bivariate analysis¶
In [36]:
sns.scatterplot(data=df, x='price', y='minimum_nights')
plt.title('Price vs Minimum Nights')
plt.show()
In [37]:
sns.scatterplot(data=df, x='price', y='number_of_reviews')
plt.title('Price vs number_of_reviews')
plt.show()
In [38]:
sns.scatterplot(data=df, y='price', x='calculated_host_listings_count')
plt.title('Price vs calculated_host_listings_count')
plt.show()
In [39]:
sns.scatterplot(data=df, y='price', x='reviews_per_month')
plt.title('Price vs reviews_per_month')
plt.show()
In [40]:
sns.scatterplot(data=df, y='price', x='availability_365')
plt.title('Price vs availability_365')
plt.show()
In [41]:
nyc=df.drop(columns=['host_name','last_review','neighbourhood'])
# Convert categorical columns to numeric using label encoding
from sklearn.preprocessing import LabelEncoder
# Label encoding for neighbourhood_group
le_neighbourhood = LabelEncoder()
nyc['neighbourhood_group'] = le_neighbourhood.fit_transform(nyc['neighbourhood_group'])
# Label encoding for room_type
le_room_type = LabelEncoder()
nyc['room_type'] = le_room_type.fit_transform(nyc['room_type'])
# Calculate correlation matrix
numeric_columns = nyc.select_dtypes(include=[np.number]) # Select only numeric columns
correlation_matrix = numeric_columns.corr()
#print(correlation_matrix)
#heatmap
plt.figure(figsize=(10, 8)) # Adjust size as needed
sns.heatmap(correlation_matrix,
annot=True, # Annotate with correlation coefficients
fmt=".2f", # Format for annotations
cmap='viridis', # Colormap
square=True, # Make each cell square
cbar_kws={"shrink": 0.8}) # Shrink the color bar
plt.title("Correlation Matrix Heatmap")
plt.show()
In [42]:
df.groupby('neighbourhood_group')['price'].mean().plot(kind='bar', color='lightblue')
plt.title('Average Price by Neighbourhood Group')
plt.ylabel('Average Price')
plt.show()
In [43]:
sns.boxplot(data=df, x='room_type', y='price')
plt.title('Price Distribution by Room Type')
plt.show()
In [44]:
pd.crosstab(df['room_type'], df['neighbourhood_group']).plot(kind='bar', stacked=True)
plt.title('Room Type Distribution Across Neighbourhood Groups')
plt.show()
Multivariate analysis¶
In [45]:
sns.pairplot(df[['price', 'minimum_nights', 'availability_365', 'reviews_per_month']])
plt.show()
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
In [46]:
nyc_model = df.select_dtypes(include=[np.number]) # Select only numeric columns
corr = nyc_model.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='cividis')
plt.title('Correlation Between Variables')
plt.show()
In [47]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
features = df[['price', 'minimum_nights', 'availability_365']].dropna()
kmeans.fit(features)
features['cluster'] = kmeans.labels_
sns.scatterplot(x=features['price'], y=features['minimum_nights'], hue=features['cluster'])
plt.show()
In [48]:
df=df.drop(columns=['host_name','last_review','neighbourhood'])
# Convert categorical columns to numeric using label encoding
from sklearn.preprocessing import LabelEncoder
# Label encoding for neighbourhood_group
le_neighbourhood = LabelEncoder()
df['neighbourhood_group'] = le_neighbourhood.fit_transform(df['neighbourhood_group'])
# Label encoding for room_type
le_room_type = LabelEncoder()
df['room_type'] = le_room_type.fit_transform(df['room_type'])
In [49]:
# Step 2: Select features and the target variable
features = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'availability_365','neighbourhood_group','room_type']
X = df[features]
y = df['price']
In [61]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 48858 entries, 0 to 48894 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 48858 non-null int64 1 name 48858 non-null object 2 host_id 48858 non-null int64 3 neighbourhood_group 48858 non-null int32 4 latitude 48858 non-null float64 5 longitude 48858 non-null float64 6 room_type 48858 non-null int32 7 price 48858 non-null int64 8 minimum_nights 48858 non-null int64 9 number_of_reviews 48858 non-null int64 10 reviews_per_month 48858 non-null float64 11 calculated_host_listings_count 48858 non-null int64 12 availability_365 48858 non-null int64 dtypes: float64(3), int32(2), int64(7), object(1) memory usage: 4.8+ MB
In [65]:
df.drop('name',axis=1,inplace=True)
STANDARDIZATION¶
In [66]:
sc=StandardScaler()
df=sc.fit_transform(df)
In [67]:
# Step 3: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [68]:
# Step 4: Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
Out[68]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [69]:
# Step 5: Generate the residual plot
y_pred = model.predict(X_test)
residuals = y_test - y_pred
plt.figure(figsize=(15, 7))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0,color='red', linestyle='--' , linewidth=1.2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
In [70]:
## To assess multicollinearity, we calculate the Variance Inflation Factor (VIF) for each feature.
# Step 6: Calculate Variance Inflation Factor (VIF)
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
In [71]:
print("Variance Inflation Factor (VIF):")
print(vif_data)
Variance Inflation Factor (VIF):
Feature VIF
0 latitude 473038.081597
1 longitude 471977.141989
2 minimum_nights 1.167698
3 number_of_reviews 1.327913
4 availability_365 1.850077
5 neighbourhood_group 6.794383
6 room_type 1.880712
In [72]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
In [73]:
# Define the model
model = RandomForestRegressor(random_state=42)
# Define parameter grid
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
In [58]:
# Grid Search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
# Best Parameters
print("Best Parameters:", grid_search.best_params_)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
In [59]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
In [82]:
import warnings
import time
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from ydata_profiling import ProfileReport
#pip install ydata-profiling
MODEL BUILDING¶
In [83]:
models = [('LR', LinearRegression()),
("Ridge", Ridge()),
("Lasso", Lasso()),
("ElasticNet", ElasticNet()),
('KNN', KNeighborsRegressor()),
('CART', DecisionTreeRegressor()),
('RF', RandomForestRegressor()),
#('SVR', SVR()),
('GBM', GradientBoostingRegressor()),
("XGBoost", XGBRegressor(objective='reg:squarederror'))]
rmse_scores = []
r2_scores = []
mae_scores = []
mse_scores = []
execution_times = []
for name, regressor in models:
start_time = time.time()
# Fit the model
regressor.fit(X_train, y_train)
# Make predictions
y_pred = regressor.predict(X_test)
# Calculate RMSE
rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_squared_error")))
rmse_scores.append(rmse)
# Calculate R^2 score
r2 = metrics.r2_score(y_test, y_pred)
r2_scores.append(r2)
# Calculate MAE
mae = metrics.mean_absolute_error(y_test, y_pred)
mae_scores.append(mae)
# Calculate MSE
mse = metrics.mean_squared_error(y_test, y_pred)
mse_scores.append(mse)
# Calculate the execution time of the model
execution_time = time.time() - start_time
execution_times.append(execution_time)
print(f"RMSE: {round(rmse, 4)} ({name})")
print(f"R^2 Score: {round(r2, 4)} ({name})")
print(f"MAE: {round(mae, 4)} ({name})")
print(f"MSE: {round(mse, 4)} ({name})")
print(f"Execution Time: {round(execution_time, 2)} seconds\n")
# Plot RMSE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], rmse_scores)
plt.xlabel("Model")
plt.ylabel("RMSE")
plt.title("Model Performance (RMSE)")
plt.show()
# Plot R^2 scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], r2_scores)
plt.xlabel("Model")
plt.ylabel("R^2 Score")
plt.title("Model Performance (R^2 Score)")
plt.show()
# Plot MAE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], mae_scores)
plt.xlabel("Model")
plt.ylabel("MAE")
plt.title("Model Performance (MAE)")
plt.show()
# Plot MSE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], mse_scores)
plt.xlabel("Model")
plt.ylabel("MSE")
plt.title("Model Performance (MSE)")
plt.show()
# Plot execution times
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], execution_times)
plt.xlabel("Execution Time (seconds)")
plt.ylabel("Model")
plt.title("Execution Times for Different Models")
plt.show()
RMSE: 227.9741 (LR) R^2 Score: 0.0925 (LR) MAE: 73.5269 (LR) MSE: 45800.715 (LR) Execution Time: 0.22 seconds RMSE: 227.9735 (Ridge) R^2 Score: 0.0925 (Ridge) MAE: 73.4931 (Ridge) MSE: 45798.9203 (Ridge) Execution Time: 0.11 seconds RMSE: 229.2451 (Lasso) R^2 Score: 0.0838 (Lasso) MAE: 74.5979 (Lasso) MSE: 46236.5532 (Lasso) Execution Time: 0.11 seconds RMSE: 232.9166 (ElasticNet) R^2 Score: 0.0541 (ElasticNet) MAE: 81.2278 (ElasticNet) MSE: 47734.9972 (ElasticNet) Execution Time: 0.11 seconds RMSE: 252.8757 (KNN) R^2 Score: -0.1436 (KNN) MAE: 88.7902 (KNN) MSE: 57715.8617 (KNN) Execution Time: 2.0 seconds RMSE: 348.8585 (CART) R^2 Score: -1.0991 (CART) MAE: 88.899 (CART) MSE: 105935.0795 (CART) Execution Time: 2.61 seconds RMSE: 239.1098 (RF) R^2 Score: -0.0313 (RF) MAE: 69.6856 (RF) MSE: 52045.4296 (RF) Execution Time: 168.63 seconds RMSE: 224.9954 (GBM) R^2 Score: 0.1348 (GBM) MAE: 66.5429 (GBM) MSE: 43664.9368 (GBM) Execution Time: 29.19 seconds RMSE: 237.2593 (XGBoost) R^2 Score: 0.039 (XGBoost) MAE: 70.0264 (XGBoost) MSE: 48497.3853 (XGBoost) Execution Time: 1.49 seconds
HYPERPARAMETER TUNING¶
In [84]:
# Initialize the models
models = [('LR', LinearRegression()),
("Ridge", Ridge()),
("Lasso", Lasso()),
("ElasticNet", ElasticNet()),
('KNN', KNeighborsRegressor()),
('CART', DecisionTreeRegressor()),
('RF', RandomForestRegressor()),
#('SVR', SVR()),
('GBM', GradientBoostingRegressor()),
("XGBoost", XGBRegressor(objective='reg:squarederror'))]
# Initialize lists to store metrics
rmse_scores = []
r2_scores = []
mae_scores = []
mse_scores = []
execution_times = []
# Define the hyperparameters for each model
param_grids = {
'LR': {},
'Ridge': {'alpha': [0.1, 1.0]},
'Lasso': {'alpha': [0.1, 1.0]},
'ElasticNet': {'alpha': [0.1, 1.0], 'l1_ratio': [0.1, 0.9]},
'KNN': {'n_neighbors': [3, 5]},
'CART': {'max_depth': [None, 10], 'min_samples_leaf': [1, 2]},
'RF': {'n_estimators': [10, 50], 'max_depth': [None, 10]},
'GBM': {'n_estimators': [10, 50], 'learning_rate': [0.01, 0.1]},
'XGBoost': {'n_estimators': [10, 50], 'learning_rate': [0.01, 0.1]}}
# Train and evaluate the models with hyperparameter tuning
for name, regressor in models:
print(f"Hyperparameter Tuning for {name}:")
start_time = time.time()
if param_grids[name]:
grid_search = GridSearchCV(regressor, param_grid=param_grids[name], cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
else:
best_model = regressor.fit(X_train, y_train)
# Make predictions
y_pred = best_model.predict(X_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse_scores.append(rmse)
# Calculate R^2 score
r2 = r2_score(y_test, y_pred)
r2_scores.append(r2)
# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
mae_scores.append(mae)
# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
mse_scores.append(mse)
# Calculate the execution time of the model
execution_time = time.time() - start_time
execution_times.append(execution_time)
print(f"RMSE: {round(rmse, 4)} ({name})")
print(f"R^2 Score: {round(r2, 4)} ({name})")
print(f"MAE: {round(mae, 4)} ({name})")
print(f"MSE: {round(mse, 4)} ({name})")
print(f"Execution Time: {round(execution_time, 2)} seconds\n")
# Plot RMSE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], rmse_scores)
plt.xlabel("Model")
plt.ylabel("RMSE")
plt.title("Model Performance (RMSE)")
plt.show()
# Plot R^2 scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], r2_scores)
plt.xlabel("Model")
plt.ylabel("R^2 Score")
plt.title("Model Performance (R^2 Score)")
plt.show()
# Plot MAE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], mae_scores)
plt.xlabel("Model")
plt.ylabel("MAE")
plt.title("Model Performance (MAE)")
plt.show()
# Plot MSE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], mse_scores)
plt.xlabel("Model")
plt.ylabel("MSE")
plt.title("Model Performance (MSE)")
plt.show()
# Plot execution times
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], execution_times)
plt.xlabel("Execution Time (seconds)")
plt.ylabel("Model")
plt.title("Execution Times for Different Models")
plt.show()
Hyperparameter Tuning for LR:
RMSE: 214.011 (LR)
R^2 Score: 0.0925 (LR)
MAE: 73.5269 (LR)
MSE: 45800.715 (LR)
Execution Time: 0.02 seconds
Hyperparameter Tuning for Ridge:
Best parameters: {'alpha': 0.1}
RMSE: 214.0106 (Ridge)
R^2 Score: 0.0925 (Ridge)
MAE: 73.5234 (Ridge)
MSE: 45800.5211 (Ridge)
Execution Time: 6.35 seconds
Hyperparameter Tuning for Lasso:
Best parameters: {'alpha': 0.1}
RMSE: 213.9908 (Lasso)
R^2 Score: 0.0926 (Lasso)
MAE: 73.2759 (Lasso)
MSE: 45792.0777 (Lasso)
Execution Time: 0.25 seconds
Hyperparameter Tuning for ElasticNet:
Best parameters: {'alpha': 0.1, 'l1_ratio': 0.9}
RMSE: 215.1073 (ElasticNet)
R^2 Score: 0.0831 (ElasticNet)
MAE: 74.7703 (ElasticNet)
MSE: 46271.1676 (ElasticNet)
Execution Time: 0.47 seconds
Hyperparameter Tuning for KNN:
Best parameters: {'n_neighbors': 5}
RMSE: 240.2413 (KNN)
R^2 Score: -0.1436 (KNN)
MAE: 88.7902 (KNN)
MSE: 57715.8617 (KNN)
Execution Time: 1.69 seconds
Hyperparameter Tuning for CART:
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2}
RMSE: 239.1553 (CART)
R^2 Score: -0.1333 (CART)
MAE: 68.4253 (CART)
MSE: 57195.2403 (CART)
Execution Time: 1.53 seconds
Hyperparameter Tuning for RF:
Best parameters: {'max_depth': 10, 'n_estimators': 50}
RMSE: 220.8294 (RF)
R^2 Score: 0.0337 (RF)
MAE: 67.5705 (RF)
MSE: 48765.6126 (RF)
Execution Time: 29.89 seconds
Hyperparameter Tuning for GBM:
Best parameters: {'learning_rate': 0.1, 'n_estimators': 50}
RMSE: 208.7253 (GBM)
R^2 Score: 0.1367 (GBM)
MAE: 66.4528 (GBM)
MSE: 43566.2706 (GBM)
Execution Time: 10.76 seconds
Hyperparameter Tuning for XGBoost:
Best parameters: {'learning_rate': 0.1, 'n_estimators': 10}
RMSE: 214.8463 (XGBoost)
R^2 Score: 0.0854 (XGBoost)
MAE: 70.1504 (XGBoost)
MSE: 46158.9199 (XGBoost)
Execution Time: 1.56 seconds
BEST MODEL¶
In [85]:
best_model
Out[85]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=10, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=10, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In [86]:
# Final Prediction Model
final_model = best_model
# Make predictions on the test set using the final model
y_final_pred = final_model.predict(X_test)
final_y_pred = (y_final_pred)
final_y_test =(y_test)
In [87]:
# Create a DataFrame with the predicted prices and true prices
results = pd.DataFrame({'Predicted Price': final_y_pred, 'True Price': final_y_test})
# Calculate the difference between the true prices and predicted prices and add a new column
results['Difference'] = results['True Price'] - results['Predicted Price']
# Display the results
print(results)
Predicted Price True Price Difference 5880 177.967026 140 -37.967026 35926 236.515106 399 162.484894 16413 143.900497 117 -26.900497 23347 97.397881 25 -72.397881 2531 222.163651 145 -77.163651 ... ... ... ... 9154 189.075378 177 -12.075378 29973 94.300438 75 -19.300438 10901 98.460770 100 1.539230 4182 93.681686 71 -22.681686 43095 125.407310 70 -55.407310 [9772 rows x 3 columns]
FEATURE IMPORTANCE¶
In [90]:
def plot_importance(model, features, num=50, save=False):
feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
plt.figure(figsize=(8, 10))
sns.set(font_scale=1)
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value",
ascending=False)[0:num])
plt.title('Features')
plt.tight_layout()
plt.show(block=True)
if save:
plt.savefig('importances.png')
plot_importance(final_model, X)